From cf062e6040afbf79dea5e8345125d39471497100 Mon Sep 17 00:00:00 2001
From: Daniel Sabo
Date: Mon, 10 Jun 2013 16:48:36 -0700
Subject: [PATCH] Add SSE2 float -> byte conversions

---
 extensions/Makefile.am |   3 +
 extensions/sse2-int8.c | 316 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 319 insertions(+)
 create mode 100644 extensions/sse2-int8.c

diff --git a/extensions/Makefile.am b/extensions/Makefile.am
index c68f075..f665418 100644
--- a/extensions/Makefile.am
+++ b/extensions/Makefile.am
@@ -29,6 +29,7 @@ ext_LTLIBRARIES = \
 	simple.la \
 	sse-fixups.la \
 	sse2-float.la \
+	sse2-int8.la \
 	sse2-int16.la \
 	two-table.la
 
@@ -44,6 +45,7 @@ naive_CMYK_la_SOURCES = naive-CMYK.c
 HSV_la_SOURCES = HSV.c
 sse_fixups_la_SOURCES = sse-fixups.c
 sse2_float_la_SOURCES = sse2-float.c
+sse2_int8_la_SOURCES = sse2-int8.c
 sse2_int16_la_SOURCES = sse2-int16.c
 two_table_la_SOURCES = two-table.c two-table-tables.h
 float_la_SOURCES = float.c
@@ -53,4 +55,5 @@ LIBS = $(top_builddir)/babl/libbabl-@BABL_API_VERSION@.la $(MATH_LIB)
 
 sse_fixups_la_CFLAGS = $(SSE_EXTRA_CFLAGS)
 sse2_float_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
+sse2_int8_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
 sse2_int16_la_CFLAGS = $(SSE2_EXTRA_CFLAGS)
diff --git a/extensions/sse2-int8.c b/extensions/sse2-int8.c
new file mode 100644
index 0000000..67ff579
--- /dev/null
+++ b/extensions/sse2-int8.c
@@ -0,0 +1,316 @@
+/* babl - dynamically extendable universal pixel conversion library.
+ * Copyright (C) 2013 Daniel Sabo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#include "config.h"
+
+#if defined(USE_SSE2)
+
+/* SSE 2 */
+#include <emmintrin.h>
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "babl.h"
+#include "babl-cpuaccel.h"
+#include "extensions/util.h"
+
+/* Convert one float sample to a byte: scale by 255, add 0.5 and
+ * truncate, with the result clamped to 0x00..0xFF.
+ *
+ * NaN fails both comparisons and ends up as 0xFF.  The vector loops of
+ * conv_yF_y8 give the same result, as _mm_min_ps returns its second
+ * operand when the first one is NaN, and no NaN is ever converted to
+ * an integer here.
+ */
+static inline uint8_t
+float_to_u8 (float y)
+{
+  if (y <= 0.0f)
+    return 0x0;
+  if (y < 1.0f)
+    return (uint8_t)(0xFF * y + 0.5f);
+
+  return 0xFF;
+}
+
+/* Convert samples floats from src into samples bytes at dst.
+ *
+ * Floats are converted one at a time until src is 16 byte aligned,
+ * then in blocks of 16, then in blocks of 4, and the rest one at a
+ * time again.  dst is only written with _mm_storeu_si128, _mm_store_ss
+ * and byte stores, so it does not have to be 16 byte aligned.
+ *
+ * Returns samples.
+ */
+static inline long
+conv_yF_y8 (const float *src, uint8_t *dst, long samples)
+{
+  const __v4sf *s_vec;
+  __m128i      *d_vec;
+  uint32_t     *d_int;
+
+  long n = samples;
+
+  const __v4sf byte_fill = _mm_set_ps1(255.0f);
+  const __v4sf half      = _mm_set_ps1(0.5);
+
+  while (((uintptr_t)src % 16) && n > 0)
+    {
+      /* Work through the unaligned floats */
+      *dst++ = float_to_u8 (*src++);
+
+      n -= 1;
+    }
+
+  s_vec = (const __v4sf *)src;
+  d_vec = (__m128i *)dst;
+
+  /* Aligned chunks, a block of exactly 16 floats is vectorized too */
+
+  while (n >= 16)
+    {
+      __v4sf yyyy0, yyyy1, yyyy2, yyyy3;
+      __m128i i32_0, i32_1, i32_2, i32_3;
+      __m128i i16_01, i16_23;
+      __m128i mm_ints;
+
+      /* Add 0.5 and truncate, to match C rounding behavior.
+       *
+       * The _mm_min_ps is needed because _mm_packs_epi32 uses
+       * signed saturation, the unsigned version wasn't added
+       * until SSE4.
+       */
+      yyyy0 = *s_vec++ * byte_fill + half;
+      yyyy0 = _mm_min_ps(yyyy0, byte_fill);
+      i32_0 = _mm_cvttps_epi32 ((__m128)yyyy0);
+
+      yyyy1 = *s_vec++ * byte_fill + half;
+      yyyy1 = _mm_min_ps(yyyy1, byte_fill);
+      i32_1 = _mm_cvttps_epi32 ((__m128)yyyy1);
+
+      i16_01 = _mm_packs_epi32 (i32_0, i32_1);
+
+      yyyy2 = *s_vec++ * byte_fill + half;
+      yyyy2 = _mm_min_ps(yyyy2, byte_fill);
+      i32_2 = _mm_cvttps_epi32 ((__m128)yyyy2);
+
+      yyyy3 = *s_vec++ * byte_fill + half;
+      yyyy3 = _mm_min_ps(yyyy3, byte_fill);
+      i32_3 = _mm_cvttps_epi32 ((__m128)yyyy3);
+
+      i16_23 = _mm_packs_epi32 (i32_2, i32_3);
+
+      mm_ints = _mm_packus_epi16 (i16_01, i16_23);
+
+      _mm_storeu_si128 (d_vec++, mm_ints);
+
+      n -= 16;
+    }
+
+  d_int = (uint32_t *)d_vec;
+
+  while (n >= 4)
+    {
+      __v4sf yyyy0;
+      __m128i mm_ints;
+
+      yyyy0 = *s_vec++ * byte_fill + half;
+      yyyy0 = _mm_min_ps(yyyy0, byte_fill);
+      mm_ints = _mm_cvttps_epi32 ((__m128)yyyy0);
+      mm_ints = _mm_packs_epi32 (mm_ints, mm_ints);
+      mm_ints = _mm_packus_epi16 (mm_ints, mm_ints);
+      /* Store only the low 32 bits, the 4 converted bytes */
+      _mm_store_ss ((float *)d_int++, (__v4sf)mm_ints);
+
+      n -= 4;
+    }
+
+  src = (const float *)s_vec;
+  dst = (uint8_t *)d_int;
+
+  while (n > 0)
+    {
+      *dst++ = float_to_u8 (*src++);
+
+      n -= 1;
+    }
+
+  return samples;
+}
+
+/* Y and A are both plain components, 2 per pixel */
+static long
+conv_yaF_ya8 (const float *src, uint8_t *dst, long samples)
+{
+  return conv_yF_y8 (src, dst, samples * 2) / 2;
+}
+
+/* 3 components per pixel */
+static long
+conv_rgbF_rgb8 (const float *src, uint8_t *dst, long samples)
+{
+  return conv_yF_y8 (src, dst, samples * 3) / 3;
+}
+
+/* 4 components per pixel */
+static long
+conv_rgbaF_rgba8 (const float *src, uint8_t *dst, long samples)
+{
+  return conv_yF_y8 (src, dst, samples * 4) / 4;
+}
+
+#endif
+
+int init (void);
+
+/* Create the float and u8 formats used by this file and, if the CPU
+ * reports SSE2 support, register the conversions between them.
+ * Always returns 0.
+ */
+int
+init (void)
+{
+#if defined(USE_SSE2)
+  const Babl *rgbaF_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgba8_linear = babl_format_new (
+    babl_model ("RGBA"),
+    babl_type ("u8"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbaF_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgba8_gamma = babl_format_new (
+    babl_model ("R'G'B'A"),
+    babl_type ("u8"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *rgbF_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("float"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgb8_linear = babl_format_new (
+    babl_model ("RGB"),
+    babl_type ("u8"),
+    babl_component ("R"),
+    babl_component ("G"),
+    babl_component ("B"),
+    NULL);
+  const Babl *rgbF_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("float"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *rgb8_gamma = babl_format_new (
+    babl_model ("R'G'B'"),
+    babl_type ("u8"),
+    babl_component ("R'"),
+    babl_component ("G'"),
+    babl_component ("B'"),
+    NULL);
+  const Babl *yaF_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *ya8_linear = babl_format_new (
+    babl_model ("YA"),
+    babl_type ("u8"),
+    babl_component ("Y"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yaF_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *ya8_gamma = babl_format_new (
+    babl_model ("Y'A"),
+    babl_type ("u8"),
+    babl_component ("Y'"),
+    babl_component ("A"),
+    NULL);
+  const Babl *yF_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("float"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *y8_linear = babl_format_new (
+    babl_model ("Y"),
+    babl_type ("u8"),
+    babl_component ("Y"),
+    NULL);
+  const Babl *yF_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("float"),
+    babl_component ("Y'"),
+    NULL);
+  const Babl *y8_gamma = babl_format_new (
+    babl_model ("Y'"),
+    babl_type ("u8"),
+    babl_component ("Y'"),
+    NULL);
+
+/* Each kernel only rescales components, so the same function is
+ * registered for the _linear and for the _gamma pair of formats.
+ */
+#define CONV(src, dst) \
+{ \
+  babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
+  babl_conversion_new (src ## _gamma, dst ## _gamma, "linear", conv_ ## src ## _ ## dst, NULL); \
+}
+
+  if ((babl_cpu_accel_get_support () & BABL_CPU_ACCEL_X86_SSE2))
+    {
+      CONV(rgbaF, rgba8);
+      CONV(rgbF,  rgb8);
+      CONV(yaF,   ya8);
+      CONV(yF,    y8);
+    }
+
+#endif
+
+  return 0;
+}
+
-- 
2.30.2